#!/usr/bin/env python3

# The point of this module is to apply some transformations to the HTML output
# of mandoc, in order to improve it.

import bleach
import glob
import os
import re
import sys
from pyquery import PyQuery as pq

# Matches a reference to another manpage, e.g. ls(1) or <strong>ls</strong>(1):
#    - some manpages use a bold or italic name, so an optional opening <em> or
#      <strong> tag is matched and remembered as <tag>
#    - lazy match of the characters valid in a manpage name, named <page>
#    - when <tag> matched, the closing tag of the same kind is required; pairing
#      the tags keeps the substitution from swallowing only one half of an
#      emphasis element and leaving unbalanced HTML behind
#    - match '('
#    - match a digit, then lazily anything except whitespace, named <section>
#    - match ')'
# Raw strings are required: '\(' and '\S' are not valid string escapes.
_MANPAGE_REF = re.compile(
    r'(?:<(?P<tag>em|strong)>)?'
    r'(?P<page>[a-zA-Z0-9._]+?)'
    r'(?(tag)</(?P=tag)>)'
    r'\((?P<section>[0-9]\S*?)\)'
)


def _render_manpage_ref(match):
    """Return the replacement HTML for one manpage reference.

    Example: ls(1)    ->    <a href="ls.1">ls</a>(1), wrapped in citerefentry /
    refentrytitle / manvolnum spans.

    The link is only produced when the targeted HTML manpage exists as
    ./man/man<section>/<page>.<section>.html, relative to the current working
    directory. Otherwise the matched text is returned exactly as it was found,
    so that pages without a target keep their markup and running the script
    again over its own output changes nothing.
    """
    page = match.group('page')
    section = match.group('section')

    target = './man/man' + section + '/' + page + '.' + section + '.html'
    if not os.path.isfile(target):
        return match.group(0)

    # NOTE(review): the href is "<page>.<section>" with neither directory nor
    # ".html" suffix, unlike the file tested above -- presumably resolved by
    # the web server; confirm before changing.
    return ''.join((
        '<span class="citerefentry">',
        '<span class="refentrytitle">',
        '<a href="', page, '.', section, '">', page, '</a>',
        '</span>',
        '<span class="manvolnum">(', section, ')</span>',
        '</span>',
    ))


def _link_manpage_refs(html):
    """Replace every manpage reference found in *html* with a link."""
    return _MANPAGE_REF.sub(_render_manpage_ref, html)


# Loop over the HTML files (mandoc output) and rewrite each one in place.
for file in glob.iglob('man/**/*.html', recursive=True):

    print(file)

    # mandoc declares its HTML output as UTF-8, so do not depend on the locale.
    with open(file, 'rt', encoding='utf-8') as f:
        html = f.read()

    html = _link_manpage_refs(html)

    # DISABLED: wrapping of section contents.
    # Unfortunately mandoc outputs all <tags> one below the other, so there is
    # no way to separate a section title from its content. This is bad because
    # it makes it impossible to add margin-left to the content of a section
    # from CSS.
    # So, this hack tries to fix the issue. Since all section titles have the
    # form
    # <h1 class="Sh" title="Sh" id="NAME"><a class="selflink" href="#NAME">NAME</a>
    # and since apparently only section titles use <h1> tags, we try to wrap
    # all content between a </h1> and the next <h1> into a <div>.
    # regex:
    #     - look for a </h1> tag (end of section title)
    #     - match everything next
    #     - look for the next <h1> (start of new section title)
    #       <table class="foot"> is used to match the end of the last section
    #
    # html = re.sub(
    #     '(?<=</h1>)(?P<content>.*?)(?=<h1|<table class="foot">)',
    #     lambda match: '<div class="Sh_content">' + match.group('content') + '</div>',
    #     html, flags=re.DOTALL)

    with open(file, 'wt', encoding='utf-8') as f:
        f.write(html)
